library(readr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (http://conflicted.r-lib.org/) to force all conflicts to become errors
library(ggplot2)
library(reshape2)
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
library(stats)
library(tm)
Loading required package: NLP
Attaching package: ‘NLP’
The following object is masked from ‘package:ggplot2’:
annotate
library(text2vec)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
library(textstem)
Loading required package: koRpus.lang.en
Loading required package: koRpus
Loading required package: sylly
For information on available language packages for 'koRpus', run
available.koRpus.lang()
and see ?install.koRpus.lang()
Attaching package: ‘koRpus’
The following object is masked from ‘package:tm’:
readTagged
The following object is masked from ‘package:readr’:
tokenize
library(syuzhet)
data_posts <- read.csv("~/4year/2semester/dtII/CSVs/HEIs.csv",
colClasses = c(tweet_id = "character"))
# Converting created_at to POSIXct so the timestamp can be used in date calculations
data_posts$created_at <- as.POSIXct(data_posts$created_at,
format= "%Y-%m-%dT%H:%M:%S", tz="UTC")
#View(data)
summary(data_posts)
id tweet_id text type bookmark_count favorite_count retweet_count reply_count
Length:11728 Length:11728 Length:11728 Length:11728 Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.000
Class :character Class :character Class :character Class :character 1st Qu.: 0.000 1st Qu.: 7.00 1st Qu.: 2.00 1st Qu.: 0.000
Mode :character Mode :character Mode :character Mode :character Median : 0.000 Median : 20.00 Median : 5.00 Median : 1.000
Mean : 1.543 Mean : 60.67 Mean : 10.62 Mean : 3.888
3rd Qu.: 1.000 3rd Qu.: 57.00 3rd Qu.: 11.00 3rd Qu.: 3.000
Max. :418.000 Max. :41655.00 Max. :4214.00 Max. :2317.000
view_count created_at hashtags urls media_type media_urls
Min. : 5 Min. :2022-08-01 03:05:11.00 Length:11728 Length:11728 Length:11728 Length:11728
1st Qu.: 2643 1st Qu.:2022-10-19 12:56:27.00 Class :character Class :character Class :character Class :character
Median : 6240 Median :2023-01-29 08:26:30.00 Mode :character Mode :character Mode :character Mode :character
Mean : 14182 Mean :2023-01-30 07:39:34.96
3rd Qu.: 16058 3rd Qu.:2023-05-05 14:16:43.25
Max. :7604544 Max. :2023-08-31 20:50:01.00
NA's :4840
# Count of how many entries each HEI has
number_posts <- data_posts %>%
group_by(id) %>% summarise(count = n())
number_posts
data_posts <- data_posts[data_posts$id != "complutense.csv", ]
number_posts <- data_posts %>%
group_by(id) %>% summarise(posts = n())
number_tweets <- data_posts[data_posts$type == "Tweet", ] %>%
group_by(id) %>% summarise(tweets = n())
number_replies <- data_posts[data_posts$type == "Reply", ] %>%
group_by(id) %>% summarise(replies = n())
print(number_posts)
print(number_tweets)
print(number_replies)
# Merging the counts of tweets and replies with the count of posts
data_ratio <- merge(number_posts, number_tweets, by = "id", all = TRUE)
data_ratio <- merge(data_ratio, number_replies, by = "id", all = TRUE)
data_ratio$percentage_tweets <- round(((data_ratio$tweets / data_ratio$posts) * 100), 2)
data_ratio$percentage_replies <- round(((data_ratio$replies / data_ratio$posts) * 100), 2)
data_ratio <- data_ratio[, c("id", "percentage_tweets", "percentage_replies")]
data_ratio$percentage_replies[is.na(data_ratio$percentage_replies)] <- 0
print(data_ratio)
na_count <- function(){
# Counting the number of NA values for each column
na_count <- colSums(is.na(data_posts))
# Creating a new data frame with the NA counts
na_counts_table <- data.frame(Column = names(na_count), NA_Count = na_count)
print(na_counts_table)
}
data_posts <- data_posts %>%
group_by(id) %>%
mutate(view_percentile = ntile(view_count, 100),
favorite_percentile = ntile(favorite_count, 100),
retweet_percentile = ntile(retweet_count, 100),
reply_percentile = ntile(reply_count, 100)) %>%
rowwise() %>%
mutate(avg_percentile = round(mean(c(view_percentile, favorite_percentile, retweet_percentile, reply_percentile), na.rm = TRUE), 2))
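# Worked example with hypothetical numbers: a post sitting at the 80th view
# percentile, 60th favorite, 40th retweet and 20th reply percentile gets
# avg_percentile = (80 + 60 + 40 + 20) / 4 = 50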
na_count()
data_percentile <- data_posts[, c("id", "view_percentile", "favorite_percentile", "retweet_percentile", "reply_percentile", "avg_percentile")]
print(data_percentile)
max_view_counts <- tapply(data_posts$view_count, data_posts$id, max, na.rm = TRUE)
print(max_view_counts)
duke.csv epfl.csv goe.csv harvard.csv leicester.csv manchester.csv mit.csv sb.csv stanford.csv trinity.csv
307969 105095 15455 2982704 47838 317086 7604544 607498 222593 205333
wv.csv yale.csv
109265 143108
# Imputing missing view_count values from the HEI's maximum view count scaled by the post's average percentile
data_posts$view_count <- ifelse(
is.na(data_posts$view_count),
round(max_view_counts[data_posts$id] * (data_posts$avg_percentile / 100)),
data_posts$view_count)
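# Worked example with hypothetical numbers: if an HEI's most-viewed post has
# 10,000 views and a post with a missing view_count sits at avg_percentile 25,
# the imputed value is round(10000 * (25 / 100)) = 2500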
# Imputing missing view_percentile values with the post's average percentile
data_posts$view_percentile <- ifelse(
is.na(data_posts$view_percentile),
data_posts$avg_percentile,
data_posts$view_percentile)
na_count()
average_posts <- function(timeframe){
# Calculation of the timeframe between earliest and latest post for each HEI
date_range <- data_posts %>%
group_by(id) %>%
summarise(min_date = min(created_at),
max_date = max(created_at)) %>%
mutate(num_days = as.numeric(difftime(max_date, min_date, units = timeframe)))
# Naming the column according to the timeframe
column_name <- paste0("avg_posts_per_", timeframe)
# Calculation of the average number of posts per timeframe unit for each HEI
posts_per_timeframe <- number_posts %>%
left_join(date_range, by = "id") %>%
mutate(!!column_name := round((posts / num_days), 2))
print(posts_per_timeframe)
return(posts_per_timeframe)
}
posts_per_day <- average_posts("days")
posts_per_week <- average_posts("weeks")
barplot(posts_per_day$avg_posts_per_days,
names.arg = posts_per_day$id,
main = "Average Posts per Day",
xlab = "HEI",
ylab = "Average Number of Posts",
ylim = c(0, max(posts_per_day$avg_posts_per_days) + 1),
las = 2,
col = "#3498DB")
# Adding text labels over each bar, aligned with the center of each bar
text(x = barplot(posts_per_day$avg_posts_per_days, plot = FALSE),
y = posts_per_day$avg_posts_per_days,
labels = round(posts_per_day$avg_posts_per_days, 2),
pos = 3)
barplot(posts_per_week$avg_posts_per_weeks,
names.arg = posts_per_week$id,
main = "Average Posts per Week",
xlab = "HEI",
ylab = "Average Number of Posts",
ylim = c(0, max(posts_per_week$avg_posts_per_weeks) + 5),
las = 2,
col = "#E74C3C")
text(x = barplot(posts_per_week$avg_posts_per_weeks, plot = FALSE),
y = posts_per_week$avg_posts_per_weeks,
labels = round(posts_per_week$avg_posts_per_weeks, 2),
pos = 3)
intervals <- list(
interval1 = as.POSIXct(c("2022-08-31", "2022-12-15")),
interval2 = as.POSIXct(c("2023-01-04", "2023-04-01")),
interval3 = as.POSIXct(c("2023-04-14", "2023-06-15"))
)
check_interval <- function(date) {
for (i in 1:length(intervals)) {
interval_start <- intervals[[i]][1]
interval_end <- intervals[[i]][2]
if (date >= interval_start & date <= interval_end) {
return(TRUE)
}
}
return(FALSE)
}
data_posts$academic_year <- sapply(data_posts$created_at, check_interval)
print(data.frame(id = data_posts$id, academic_year = data_posts$academic_year))
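# Side note, a minimal vectorized sketch (assuming the same `intervals` list as above):
# the per-element sapply loop above can also be replaced by combining the interval tests with Reduce
in_any_interval <- function(dates, intervals) {
  Reduce(`|`, lapply(intervals, function(iv) dates >= iv[1] & dates <= iv[2]))
}
# Equivalent to: data_posts$academic_year <- in_any_interval(data_posts$created_at, intervals)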
analyze_posts <- function(academic_year_filter) {
# Filtering the data based on the academic_year_filter
filtered_data <- data_posts %>%
filter(academic_year == academic_year_filter)
# Count of distinct posting days for each HEI
unique_days <- filtered_data %>%
group_by(id) %>%
summarise(unique_days = n_distinct(as.Date(created_at)))
# Count of posts for each HEI
number_posts_boolean <- filtered_data %>%
group_by(id) %>%
summarise(count = n())
# Naming the column according to the time period
time <- ifelse(academic_year_filter, "academic_time", "vacation_time")
column_name <- paste0("avg_posts_in_", time)
# Combination of data and calculation of average posts per day
combined_data <- left_join(unique_days, number_posts_boolean, by = "id")
combined_data <- combined_data %>%
mutate(!!column_name := round((count / unique_days), 2))
print(combined_data)
return(combined_data)
}
data_posts_academic <- analyze_posts(TRUE)
data_posts_vacations <- analyze_posts(FALSE)
barplot(data_posts_academic$avg_posts_in_academic_time,
names.arg = data_posts_academic$id,
main = "Average Posts during Academic Time",
xlab = "HEI",
ylab = "Average Number of Posts",
ylim = c(0, max(data_posts_academic$avg_posts_in_academic_time) + 5),
las = 2,
col = "#34495E")
text(x = barplot(data_posts_academic$avg_posts_in_academic_time, plot = FALSE),
y = data_posts_academic$avg_posts_in_academic_time,
labels = round(data_posts_academic$avg_posts_in_academic_time, 2),
pos = 3)
barplot(data_posts_vacations$avg_posts_in_vacation_time,
names.arg = data_posts_vacations$id,
main = "Average Posts during Vacation Time",
xlab = "HEI",
ylab = "Average Number of Posts",
ylim = c(0, max(data_posts_vacations$avg_posts_in_vacation_time) + 5),
las = 2,
col = "#D35400")
text(x = barplot(data_posts_vacations$avg_posts_in_vacation_time, plot = FALSE),
y = data_posts_vacations$avg_posts_in_vacation_time,
labels = round(data_posts_vacations$avg_posts_in_vacation_time, 2),
pos = 3)
# Creating new table that contains a new column for the day of the week
data_posts_days <- data_posts %>%
mutate(day_of_week = weekdays(created_at))
# Selecting only the id, created_at, and day_of_week columns for the new table
data_posts_days <- data_posts_days %>%
select(id, created_at, day_of_week)
# Create column hour from created_at
data_posts_days$created_hour <- as.numeric(format(data_posts_days$created_at, "%H"))
print(data_posts_days)
# Grouping by id and day_of_week, then counting the number of posts
number_posts_days <- data_posts_days %>%
group_by(id, day_of_week) %>%
summarise(count = n())
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
# Grouping by id, day_of_week, and creation date, then counting the number of posts
number_posts_per_day <- data_posts_days %>%
mutate(created_date = as.Date(created_at)) %>%
group_by(id, day_of_week, created_date) %>%
summarize(count = n())
`summarise()` has grouped output by 'id', 'day_of_week'. You can override using the `.groups` argument.
# Finding, for each HEI and day of the week, the average number of posts per day
average_number_posts_per_day <- number_posts_per_day %>%
group_by(id, day_of_week) %>%
summarise(average_count = round(mean(count), 2))
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
print(number_posts_days)
# Finding, for each day of the week, the HEI with the lowest count of posts
lowest_count <- number_posts_days %>%
group_by(day_of_week) %>%
slice_min(order_by = count) %>%
select(day_of_week, id, count)
# Finding, for each day of the week, the HEI with the highest count of posts
highest_count <- number_posts_days %>%
group_by(day_of_week) %>%
slice_max(order_by = count) %>%
select(day_of_week, id, count)
# Combine the results
high_low_HEI <- bind_rows(lowest_count, highest_count) %>%
arrange(day_of_week)
print(high_low_HEI)
ggplot(high_low_HEI, aes(x = day_of_week, y = count, fill = id)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = count),
position = position_dodge(width = 0.9),
vjust = -0.5,
size = 3) +
labs(title = "Highest and Lowest Count of Posts per Day for Each Day of the Week",
x = "Day of the Week", y = "Count") +
scale_fill_manual(values = rainbow(length(unique(high_low_HEI$id)))) +
theme_minimal() +
theme(legend.title = element_blank())
# Finding, for each day of the week, the HEIs with the lowest and highest average count of posts per day
high_low_average_HEIs <- average_number_posts_per_day %>%
group_by(day_of_week) %>%
filter(average_count == max(average_count) | average_count == min(average_count)) %>%
arrange(day_of_week, ifelse(average_count == min(average_count), average_count, -average_count))
print(high_low_average_HEIs)
ggplot(high_low_average_HEIs, aes(x = day_of_week, y = average_count, fill = id)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = round(average_count, 2)),
position = position_dodge(width = 0.7),
vjust = -0.5,
size = 3) +
labs(title = "Highest and Lowest Average Count of Posts per Day for Each Day of the Week",
x = "Day of the Week", y = "Average Count") +
scale_fill_manual(values = rainbow(length(unique(high_low_HEI$id)))) +
theme_minimal() +
theme(legend.title = element_blank())
favourite_day_hei <- number_posts_days %>%
group_by(id) %>%
top_n(1, count) %>%
arrange(id)
print(favourite_day_hei)
# Calculating the number of posts per hour
number_posts_hours <- data_posts_days %>%
group_by(id, created_hour) %>%
summarise(count = n()) %>%
ungroup()
`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.
# Identifying the favorite hour for each HEI
favourite_hour_hei <- number_posts_hours %>%
group_by(id) %>%
top_n(1, count) %>%
arrange(id)
# Adding new columns to categorize the hours and assign numerical value for clustering
favourite_hour_hei <- favourite_hour_hei %>%
mutate(time_of_day = case_when(
created_hour >= 5 & created_hour < 12 ~ "Morning",
created_hour >= 12 & created_hour < 18 ~ "Afternoon",
TRUE ~ "Night"
),
time_of_day_value = case_when(
created_hour >= 5 & created_hour < 12 ~ 1,
created_hour >= 12 & created_hour < 18 ~ 3,
TRUE ~ 5
))
print(favourite_hour_hei)
heatmap_maker <- function(target_id){
# Filtering data for the specific HEI
target_data <- data_posts_days %>%
filter(id == target_id)
# Grouping by day of the week and hour, and counting the number of tweets
tweet_counts <- target_data %>%
group_by(day_of_week, created_hour) %>%
summarise(num_posts = n())
# Plotting heatmap
ggplot(tweet_counts, aes(x = day_of_week, y = created_hour, fill = num_posts)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "blue") +
labs(title = paste("Post Heatmap for", target_id),
x = "Day of the week",
y = "Hour of the day")
}
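# Side note (sketch): the twelve manual calls below could also be generated in
# one pass over the HEI ids, e.g.
# invisible(lapply(unique(data_posts_days$id), function(x) print(heatmap_maker(x))))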
heatmap_maker("duke.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("epfl.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("goe.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("harvard.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("leicester.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("manchester.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("mit.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("sb.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("stanford.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("trinity.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("wv.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
heatmap_maker("yale.csv")
`summarise()` has grouped output by 'day_of_week'. You can override using the `.groups` argument.
# Transforming empty strings into NA
data_posts$urls[data_posts$urls == ""] <- NA
# Table with the number of posts, the number of missing URLs, and the percentage of posts containing a URL
url_usage <- data_posts %>%
group_by(id) %>%
summarise(count = n(),
na = sum(is.na(urls)),
url_percentage = round(((count - na) / count * 100), 2))
print(url_usage)
barplot(url_usage$url_percentage,
names.arg = url_usage$id,
main = "Urls Percentage for Each HEI",
xlab = "HEI",
ylab = "Urls Percentage",
ylim = c(0, max(url_usage$url_percentage) + 10),
las = 2,
col= "#8E44AD")
text(x = barplot(url_usage$url_percentage, plot = FALSE),
y = url_usage$url_percentage,
labels = round(url_usage$url_percentage, 2),
pos = 3)
data_posts_content <- data_posts %>%
select(id, text)
# Counting number of words
data_posts_content <- data_posts_content %>%
mutate(num_words = lengths(strsplit(text, "\\s+")))
# Grouping by HEI and calculating the average, minimum, and maximum number of words
data_posts_content_metrics <- data_posts_content %>%
group_by(id) %>%
summarise(average_num_words = mean(num_words),
min_num_words = min(num_words),
max_num_words = max(num_words))
print(data_posts_content_metrics)
ggplot(data_posts_content_metrics, aes(x = id, y = average_num_words)) +
geom_point(aes(color = "Average")) +
geom_errorbar(aes(ymin = min_num_words, ymax = max_num_words, color = "Range"), width = 0.2) +
scale_color_manual(values = c("Average" = "#1976D2", "Range" = "#EF5350")) +
labs(title = "Word Count Summary by HEI",
x = "HEI",
y = "Number of Words",
color = "Metric") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
cleanup <- function(docs, spec.words=NULL){
  # lowercase
  docs <- tm_map(docs, content_transformer(tolower))
  # rm numbers
  docs <- tm_map(docs, removeNumbers)
  # rm common English stopwords
  docs <- tm_map(docs, removeWords, stopwords("english"))
  # rm additional stopwords if specified as a character vector
  if(!is.null(spec.words))
    docs <- tm_map(docs, removeWords, spec.words)
  # rm punctuation
  docs <- tm_map(docs, removePunctuation)
  # rm extra white spaces
  docs <- tm_map(docs, stripWhitespace)
  # lemmatizing text (lemmatize_strings works on whole document strings,
  # whereas lemmatize_words expects an already tokenized vector of words)
  docs <- tm_map(docs, content_transformer(lemmatize_strings))
  docs
}
pairing <- function(word_dict, paired_words) {
if (nrow(word_dict) != length(paired_words)) {
stop("The number of rows in word_dict and the length of paired_words should be equal.")
}
word_dict <- tibble::tibble(word = word_dict$value)
result <- tibble::tibble(word = word_dict$word, category = paired_words)
return(result)
}
classify_text <- function(text, word_pair) {
# Tokenizing the text and converting to lowercase
words <- tolower(unlist(strsplit(text, "\\W+")))
# Finding the frequent words in the text
freq_words <- words[words %in% word_pair$word]
if (length(freq_words) == 0) {
return("Unknown") # Returning Unknown if no frequent words are found
}
# Getting the corresponding categories
categories <- word_pair$category[word_pair$word %in% freq_words]
# Sorting categories by frequency in descending order and return the most frequent one
return(names(sort(table(categories), decreasing = TRUE))[1])
}
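# Illustrative sketch with a hypothetical toy dictionary: classify_text returns the
# category with the most matched dictionary words (each matched dictionary word is
# counted once, regardless of how often it appears in the text), e.g.
# toy_pair <- tibble::tibble(word = c("research", "lab", "students"),
#                            category = c("Research", "Research", "Education"))
# classify_text("Students join the research lab", toy_pair)  # -> "Research"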
corpus_maker <- function(text) {
texts <- text$text
vc <- VectorSource(texts)
corpus <- Corpus(vc)
return(corpus)
}
freq_terms <- function(clean_text, number) {
  dtm <- DocumentTermMatrix(clean_text)
  dtm.tfidf <- weightTfIdf(dtm)
  mdf <- as_tibble(as.matrix(dtm.tfidf))
  mdf.freq <- mdf %>%
    select(all_of(findFreqTerms(dtm, number))) %>%
    summarise_all(sum) %>%
    gather() %>%
    arrange(desc(value))
  mdf.freq$key <-
    factor(mdf.freq$key,
           levels = mdf.freq$key[order(mdf.freq$value)])
  word_dictionary <- as_tibble(mdf.freq$key)
  # print() is needed for the plot to render from inside a function
  print(ggplot(mdf.freq, aes(x = key, y = value)) +
          geom_bar(stat = "identity") +
          labs(x = "terms", y = "freq") + coord_flip())
  return(word_dictionary)
}
duke_text <- subset(data_posts_content, id == "duke.csv")
duke_corpus <- corpus_maker(duke_text)
duke_clean_text <- cleanup(duke_corpus, c("new", "will", "change", "north", "can", "first", "year", "carolina", "years", "summer", "meet", "one"))
Warning: transformation drops documents (repeated 7 times)
duke_word_dictionary <- freq_terms(duke_clean_text, 30)
duke_word_category <- c("Image", "Education", "Education", "Image", "Image", "Research", "Research", "Image", "Image", "Engagement", "Research", "Research", "Image", "Education", "Education", "Society", "Education", "Engagement")
duke_word_pair <- pairing(duke_word_dictionary, duke_word_category)
print(duke_word_pair)
# Applying the classify_text function to each text
duke_text <- duke_text %>%
mutate(category = sapply(text, classify_text, word_pair = duke_word_pair))
print(duke_text)
epfl_text <- subset(data_posts_content, id == "epfl.csv")
epfl_corpus <- corpus_maker(epfl_text)
epfl_clean_text <- cleanup(epfl_corpus, c("new", "amp", "can", "will", "one", "now", "–", "via", "portrait"))
Warning: transformation drops documents (repeated 7 times)
epfl_word_dictionary <- freq_terms(epfl_clean_text, 30)
epfl_word_category <- c("Image", "Research", "Image", "Research", "Research", "Research", "Education", "Image", "Research", "Research", "Education", "Research", NA, "Education", "Education")
epfl_word_pair <- pairing(epfl_word_dictionary, epfl_word_category)
print(epfl_word_pair)
# Applying the classify_text function to each text
epfl_text <- epfl_text %>%
mutate(category = sapply(text, classify_text, word_pair = epfl_word_pair))
print(epfl_text)
goe_text <- subset(data_posts_content, id == "goe.csv")
goe_corpus <- corpus_maker(goe_text)
goe_clean_text <- cleanup(goe_corpus, c("can", "new", "amp", "will"))
Warning: transformation drops documents (repeated 7 times)
goe_word_dictionary <- freq_terms(goe_clean_text, 15)
goe_word_category <- c("Research", "Research", "Image", "Education", "Engagement", "Image", "Engagement", "Education")
goe_word_pair <- pairing(goe_word_dictionary, goe_word_category)
print(goe_word_pair)
# Applying the classify_text function to each text
goe_text <- goe_text %>%
mutate(category = sapply(text, classify_text, word_pair = goe_word_pair))
print(goe_text)
harvard_text <- subset(data_posts_content, id == "harvard.csv")
harvard_corpus <- corpus_maker(harvard_text)
harvard_clean_text <- cleanup(harvard_corpus, c("new", "can", "will", "summer", "year", "first", "may", "-", "recent", "years", "one", "said", "time", "many", "world", "change"))
Warning: transformation drops documents (repeated 7 times)
harvard_word_dictionary <- freq_terms(harvard_clean_text, 50)
harvard_word_category <- c("Image", "Education", "Research", "Image", "Research", "Research", "Engagement", "Education", "Society", "Education", "Society", "Research", "Education", "Image", "Research", NA, "Research", "Education", "Education", NA, "Research", "Society", "Society", "Research", "Education")
harvard_word_pair <- pairing(harvard_word_dictionary, harvard_word_category)
print(harvard_word_pair)
harvard_text <- harvard_text %>%
mutate(category = sapply(text, classify_text, word_pair = harvard_word_pair))
print(harvard_text)
leicester_text <- subset(data_posts_content, id == "leicester.csv")
leicester_corpus <- corpus_maker(leicester_text)
leicester_clean_text <- cleanup(leicester_corpus, c("👉", "day", "new", "clear", "year", "can", "will", "time", "space", "one", "first"))
Warning: transformation drops documents (repeated 7 times)
leicester_word_dictionary <- freq_terms(leicester_clean_text, 60)
leicester_word_category <- c("Image", NA, "Image", "Engagement", "Society", "Education", "Education", "Research", "Engagement", "Engagement", "Image", "Engagement", "Education", "Engagement", "Engagement", "Education", "Image", "Engagement", "Education", NA, "Image")
leicester_word_pair <- pairing(leicester_word_dictionary, leicester_word_category)
print(leicester_word_pair)
leicester_text <- leicester_text %>%
mutate(category = sapply(text, classify_text, word_pair = leicester_word_pair))
print(leicester_text)
manchester_text <- subset(data_posts_content, id == "manchester.csv")
manchester_corpus <- corpus_maker(manchester_text)
manchester_clean_text <- cleanup(manchester_corpus, c("👇", "can", "just", "will", "get", "well", "one", "help", "now", "new", "read", "congratulations"))
Warning: transformation drops documents (repeated 7 times)
manchester_word_dictionary <- freq_terms(manchester_clean_text, 50)
manchester_word_category <- c(NA, "Image", "Engagement", "Image", "Education", "Education", "Engagement", "Image", "Education", "Society", "Research", "Research", "Education", "Education", "Education", "Engagement", "Research", "Education", "Research")
manchester_word_pair <- pairing(manchester_word_dictionary, manchester_word_category)
print(manchester_word_pair)
manchester_text <- manchester_text %>%
mutate(category = sapply(text, classify_text, word_pair = manchester_word_pair))
print(manchester_text)
mit_text <- subset(data_posts_content, id == "mit.csv")
mit_corpus <- corpus_maker(mit_text)
mit_clean_text <- cleanup(mit_corpus, c("new", "can", "says", "“", "’", "—", "", "may", "first", "will", "using", "way", "one", "science"))
Warning: transformation drops documents (repeated 7 times)
mit_word_dictionary <- freq_terms(mit_clean_text, 40)
mit_word_category <- c("Image", "Research", NA, "Image", NA, "Education", "Research", NA, "Education", "Research", "Research", "Education", "Research", "Research", "Image", "Education", "Education", "Research", "Research", "Research", "Research", "Research", "Research", "Research", "Research", "Society")
mit_word_pair <- pairing(mit_word_dictionary, mit_word_category)
print(mit_word_pair)
mit_text <- mit_text %>%
mutate(category = sapply(text, classify_text, word_pair = mit_word_pair))
print(mit_text)
sb_text <- subset(data_posts_content, id == "sb.csv")
sb_corpus <- corpus_maker(sb_text)
sb_clean_text <- cleanup(sb_corpus, c("new", "—", "will", "week", "can", "amp", "via", "now", "future", "first"))
Warning: transformation drops documents (repeated 7 times)
sb_word_dictionary <- freq_terms(sb_clean_text, 30)
sb_word_category <- c("Image", "Research", "Image", "Image", "Image", NA, "Research", "Image", "Image", "Research", "Education", "Image", "Education", "Image", "Education", "Image", "Society")
sb_word_pair <- pairing(sb_word_dictionary, sb_word_category)
print(sb_word_pair)
sb_text <- sb_text %>%
mutate(category = sapply(text, classify_text, word_pair = sb_word_pair))
print(sb_text)
stanford_text <- subset(data_posts_content, id == "stanford.csv")
stanford_corpus <- corpus_maker(stanford_text)
stanford_clean_text <- cleanup(stanford_corpus, c("new", "will"))
Warning: transformation drops documents (repeated 7 times)
stanford_word_dictionary <- freq_terms(stanford_clean_text, 25)
stanford_word_category <- c("Image", "Image", "Research", "Education", NA)
stanford_word_pair <- pairing(stanford_word_dictionary, stanford_word_category)
print(stanford_word_pair)
stanford_text <- stanford_text %>%
mutate(category = sapply(text, classify_text, word_pair = stanford_word_pair))
print(stanford_text)
trinity_text <- subset(data_posts_content, id == "trinity.csv")
trinity_corpus <- corpus_maker(trinity_text)
trinity_clean_text <- cleanup(trinity_corpus, c("amp", "read", "new", "can", "will", "week", "work", "great", "day", "visit", "irish", "first", "led", "congratulations"))
Warning: transformation drops documents (repeated 7 times)
trinity_word_dictionary <- freq_terms(trinity_clean_text, 40)
Warning: empty document(s): 754
trinity_word_category <- c("Image", "Education", "Research", "Image", "Research", "Image", "Research", "Society", "Society", "Education", "Engagement", "Engagement", "Education", "Image", "Education", "Image", "Research")
trinity_word_pair <- pairing(trinity_word_dictionary, trinity_word_category)
print(trinity_word_pair)
trinity_text <- trinity_text %>%
mutate(category = sapply(text, classify_text, word_pair = trinity_word_pair))
print(trinity_text)
wv_text <- subset(data_posts_content, id == "wv.csv")
wv_corpus <- corpus_maker(wv_text)
wv_clean_text <- cleanup(wv_corpus, c("💛💙", "🙌", "👉", "happy", "day", "see", "week", "great", "can", "will", "well", "know", "new", "now", "get", "just"))
Warning: transformation drops documents (repeated 7 times)
wv_word_dictionary <- freq_terms(wv_clean_text, 40)
wv_word_category <- c(NA, "Image", "Image", NA, "Image", "Image", "Engagement", "Education", "Education", "Education", NA)
wv_word_pair <- pairing(wv_word_dictionary, wv_word_category)
print(wv_word_pair)
wv_text <- wv_text %>%
mutate(category = sapply(text, classify_text, word_pair = wv_word_pair))
print(wv_text)
yale_text <- subset(data_posts_content, id == "yale.csv")
yale_corpus <- corpus_maker(yale_text)
yale_clean_text <- cleanup(yale_corpus, c("new", "—", "will", "can", "'", "first", "work", "read", "help", "year"))
Warning: transformation drops documents (repeated 7 times)
yale_word_dictionary <- freq_terms(yale_clean_text, 70)
yale_word_category <- c("Research", "Education", "Education", NA, "Research", "Research", "Research", "Research", "Education", "Image", "Research", "Education", "Image", "Research", NA, "Research", "Image", "Image", "Research", "Image", "Research", "Society", "Research", "Society", "Society")
yale_word_pair <- pairing(yale_word_dictionary, yale_word_category)
print(yale_word_pair)
yale_text <- yale_text %>%
mutate(category = sapply(text, classify_text, word_pair = yale_word_pair))
print(yale_text)
emotions_maker <- function(texts, hei_name){
text_emotion <- get_nrc_sentiment(texts)
# Proportions of the eight NRC emotions (columns 1:8 of the get_nrc_sentiment output)
emotion_proportions <- colSums(prop.table(text_emotion[, 1:8]))
# Visualization of the share of each emotion found in the posts
barplot <- barplot(
sort(colSums(prop.table(text_emotion[, 1:8]))),
horiz = TRUE,
cex.names = 0.7,
las = 1,
main = sprintf("Emotions found in %s's texts", hei_name),
xlab="Percentage",
xlim = c(0, max(emotion_proportions) * 1.2)
)
text(
x = sort(emotion_proportions),
y = barplot,
labels = sprintf("%.2f%%", 100 * sort(emotion_proportions)),
pos = 4,
cex = 0.7
)
}
duke_text$sentiment <- round(get_sentiment(duke_text$text), 2)
print(duke_text)
emotions_maker(duke_text$text, "duke")
epfl_text$sentiment <- round(get_sentiment(epfl_text$text), 2)
print(epfl_text)
emotions_maker(epfl_text$text, "epfl")
goe_text$sentiment <- round(get_sentiment(goe_text$text), 2)
print(goe_text)
emotions_maker(goe_text$text, "goe")
harvard_text$sentiment <- round(get_sentiment(harvard_text$text), 2)
print(harvard_text)
emotions_maker(harvard_text$text, "harvard")
leicester_text$sentiment <- round(get_sentiment(leicester_text$text), 2)
print(leicester_text)
emotions_maker(leicester_text$text, "leicester")
manchester_text$sentiment <- round(get_sentiment(manchester_text$text), 2)
print(manchester_text)
emotions_maker(manchester_text$text, "manchester")
mit_text$sentiment <- round(get_sentiment(mit_text$text), 2)
print(mit_text)
emotions_maker(mit_text$text, "mit")
sb_text$sentiment <- round(get_sentiment(sb_text$text), 2)
print(sb_text)
emotions_maker(sb_text$text, "sb")
stanford_text$sentiment <- round(get_sentiment(stanford_text$text), 2)
print(stanford_text)
emotions_maker(stanford_text$text, "stanford")
trinity_text$sentiment <- round(get_sentiment(trinity_text$text), 2)
print(trinity_text)
emotions_maker(trinity_text$text, "trinity")
wv_text$sentiment <- round(get_sentiment(wv_text$text), 2)
print(wv_text)
emotions_maker(wv_text$text, "wv")
yale_text$sentiment <- round(get_sentiment(yale_text$text), 2)
print(yale_text)
emotions_maker(yale_text$text, "yale")
# Combine all the sentiment scores into one data frame
all_sentiments <- rbind(
duke_text[, c("id", "sentiment")],
epfl_text[, c("id", "sentiment")],
goe_text[, c("id", "sentiment")],
harvard_text[, c("id", "sentiment")],
leicester_text[, c("id", "sentiment")],
manchester_text[, c("id", "sentiment")],
mit_text[, c("id", "sentiment")],
sb_text[, c("id", "sentiment")],
stanford_text[, c("id", "sentiment")],
trinity_text[, c("id", "sentiment")],
wv_text[, c("id", "sentiment")],
yale_text[, c("id", "sentiment")]
)
# Calculate the average sentiment for each HEI
hei_average_sentiments <- aggregate(sentiment ~ id, data = all_sentiments, FUN = mean) %>%
rename(average_sentiment = sentiment)
# Print the table
print(hei_average_sentiments)
cosine_matrix_maker <- function(table){
numerical_columns <- table %>%
select(-id)
# Normalizing columns
normalized_columns <- as.data.frame(scale(numerical_columns))
# Transposing the data to compute similarity between columns
transposed_data <- t(normalized_columns)
# Computing the cosine similarity matrix
similarity_matrix <- as.matrix(proxy::dist(transposed_data, method = "cosine"))
# Converting distance to similarity
similarity_matrix <- 1 - similarity_matrix
# Print the similarity matrix
print(similarity_matrix)
}
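# Quick check of what the cosine measure computes, using two hypothetical vectors:
# cosine similarity is the dot product divided by the product of the vector norms
# a <- c(1, 2, 3); b <- c(2, 4, 6.5)
# sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))  # close to 1: nearly parallel
# proxy::dist(..., method = "cosine") returns a distance, hence the 1 - conversion above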
# Joining unique_hashtags (number of unique hashtags) and hashtag_percentage (percentage of posts containing a hashtag) from hashtags with percentage_tweets and percentage_replies (share of tweets and replies out of all posts) from data_ratio, per HEI
cluster_table <- merge(select(hashtags, id, unique_hashtags, hashtag_percentage), select(data_ratio, id, percentage_tweets, percentage_replies), by = "id", all=TRUE)
# Joining attribute avg_posts_per_days (average of posts per day) from posts_per_day per HEI
cluster_table <- merge(cluster_table, select(posts_per_day, id, avg_posts_per_days), by = "id", all=TRUE)
# Joining attribute avg_posts_per_weeks (average of posts per week) from posts_per_week per HEI
cluster_table <- merge(cluster_table, select(posts_per_week, id, avg_posts_per_weeks), by = "id", all=TRUE)
# Joining attribute avg_posts_in_academic_time (average of posts during academic time) from data_posts_academic per HEI
cluster_table <- merge(cluster_table, select(data_posts_academic, id, avg_posts_in_academic_time), by = "id", all=TRUE)
# Joining attribute avg_posts_in_vacation_time (average of posts during vacation time) from data_posts_vacations per HEI
cluster_table <- merge(cluster_table, select(data_posts_vacations, id, avg_posts_in_vacation_time), by = "id", all=TRUE)
# Joining attribute time_of_day_value (numerical value referring to time of day where every HEI made more posts) from favourite_hour_hei per HEI
cluster_table <- merge(cluster_table, select(favourite_hour_hei, id, time_of_day_value), by = "id", all=TRUE)
# Joining attribute url_percentage (percentage of posts that contain an url) from url_usage per HEI
cluster_table <- merge(cluster_table, select(url_usage, id, url_percentage), by = "id", all=TRUE)
# Joining attribute average_num_words (average number of words in the posts) from data_posts_content_metrics per HEI
cluster_table <- merge(cluster_table, select(data_posts_content_metrics, id, average_num_words), by = "id", all=TRUE)
# Joining attribute average_sentiment (average sentiment of posts) from hei_average_sentiments per HEI
cluster_table <- merge(cluster_table, select(hei_average_sentiments, id, average_sentiment), by = "id", all=TRUE)
print(cluster_table)
cosine_matrix_maker(cluster_table)
unique_hashtags hashtag_percentage percentage_tweets percentage_replies avg_posts_per_days avg_posts_per_weeks
unique_hashtags 1.00000000 0.714423178 0.16552198 -0.15047782 0.09423180 0.09460315
hashtag_percentage 0.71442318 1.000000000 0.13470812 -0.34565768 -0.01048208 -0.01015282
percentage_tweets 0.16552198 0.134708118 1.00000000 -0.73554060 0.30916774 0.30775916
percentage_replies -0.15047782 -0.345657681 -0.73554060 1.00000000 0.09388665 0.09480553
avg_posts_per_days 0.09423180 -0.010482084 0.30916774 0.09388665 1.00000000 0.99999797
avg_posts_per_weeks 0.09460315 -0.010152816 0.30775916 0.09480553 0.99999797 1.00000000
avg_posts_in_academic_time 0.12447464 -0.033819668 0.32273419 0.06234258 0.97881722 0.97877880
avg_posts_in_vacation_time 0.09368691 0.004006656 0.16186441 0.20825601 0.97718481 0.97729844
time_of_day_value -0.22814041 -0.175920221 0.30874190 -0.21552038 -0.05609489 -0.05648404
url_percentage 0.09757172 0.395409742 0.52625841 -0.84342954 0.06010475 0.05920162
average_num_words 0.46150077 0.713419420 0.06039969 -0.48123350 -0.36976005 -0.36993229
average_sentiment 0.59299320 0.645971142 -0.31588590 -0.04093998 -0.49647197 -0.49605792
avg_posts_in_academic_time avg_posts_in_vacation_time time_of_day_value url_percentage average_num_words average_sentiment
unique_hashtags 0.12447464 0.093686911 -0.22814041 0.09757172 0.46150077 0.59299320
hashtag_percentage -0.03381967 0.004006656 -0.17592022 0.39540974 0.71341942 0.64597114
percentage_tweets 0.32273419 0.161864406 0.30874190 0.52625841 0.06039969 -0.31588590
percentage_replies 0.06234258 0.208256011 -0.21552038 -0.84342954 -0.48123350 -0.04093998
avg_posts_per_days 0.97881722 0.977184814 -0.05609489 0.06010475 -0.36976005 -0.49647197
avg_posts_per_weeks 0.97877880 0.977298444 -0.05648404 0.05920162 -0.36993229 -0.49605792
avg_posts_in_academic_time 1.00000000 0.953796153 -0.14950356 0.07310820 -0.37164829 -0.49875096
avg_posts_in_vacation_time 0.95379615 1.000000000 -0.07107087 0.02168275 -0.32110128 -0.41192255
time_of_day_value -0.14950356 -0.071070873 1.00000000 -0.01251319 -0.10921047 -0.29227459
url_percentage 0.07310820 0.021682748 -0.01251319 1.00000000 0.63011343 0.17847188
average_num_words -0.37164829 -0.321101278 -0.10921047 0.63011343 1.00000000 0.84649940
average_sentiment -0.49875096 -0.411922553 -0.29227459 0.17847188 0.84649940 1.00000000
cluster_maker <- function(num_clusters, table){
  # Excluding id column for clustering
  cluster_data <- select(table, -id)
  # Scaling the data for the kmeans method
  scaled_data <- scale(cluster_data)
  kmeans_model <- kmeans(scaled_data, centers = num_clusters, nstart = 10)
  # Extracting cluster assignments
  cluster_assignments <- kmeans_model$cluster
  # Combining the HEI ids with their cluster assignments
  clustered_data <- data.frame(id = table$id, cluster = cluster_assignments)
  print(clustered_data)
}
elbow_maker <- function(table){
cluster_data <- select(table, -id)
scaled_data <- scale(cluster_data)
wss <- vector()
range <- 1:10
for (k in range) {
kmeans_model <- kmeans(scaled_data, centers = k, nstart = 10)
wss[k] <- kmeans_model$tot.withinss
}
elbow_df <- data.frame(k = range, WSS = wss)
ggplot(elbow_df, aes(x = k, y = WSS)) +
geom_line() +
geom_point() +
labs(x = "Number of Clusters", y = "Within-Cluster Sum of Squares (WCSS)",
title = "Elbow Method for Optimal k") +
theme_minimal()
}
elbow_maker(cluster_table)
cluster_maker(4, cluster_table)
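# Note: kmeans depends on random initialization, so cluster labels can vary between
# runs; a sketch for a reproducible assignment would fix the seed first, e.g.
# set.seed(42)
# cluster_maker(4, cluster_table)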